/*

Copyright (C) 1997,1998,1999,2000,2001  Franz Josef Och

mkcls - a program for making word classes .

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
USA.

*/




#include <cstdlib>
#include "KategProblem.h"

static int oneFreqCompareSteigend(const void *p,const void *j)
{
#ifdef FREQTYPE_DOUBLE
  if( (((OneFreq *)p)->n < ((OneFreq *)j)->n) )
    return -1;
  if( (((OneFreq *)p)->n > ((OneFreq *)j)->n) )
    return +1;
  else
    return 0;
#else
  return ((OneFreq *)p)->n - ((OneFreq *)j)->n;
#endif
}
static int oneFreqCompareFallend(const void *p,const void *j)
{
#ifdef FREQTYPE_DOUBLE
  if( (((OneFreq *)p)->n > ((OneFreq *)j)->n) )
    return -1;
  if( (((OneFreq *)p)->n < ((OneFreq *)j)->n) )
    return +1;
  else
    return 0;
#else
  return -((OneFreq *)p)->n + ((OneFreq *)j)->n;
#endif
}


KategProblemWBC::KategProblemWBC(int n,int minw)
  : _n1(n,0),_n2(n,0),with_h_of_words(0),afterFilled(n,0),beforeFilled(n,0),filled(0),fixedWord(n,-1),absteigend(0),nWords(n),nTranspWords(0),
    mindestAnzahl(minw),after(n),before(n),minIndex(n,-1),maxIndex(n,-1)

{
}

KategProblemWBC::~KategProblemWBC()

{
  massert( after.size()==nWords);
  if( absteigend )
    delete absteigend;
}

void KategProblemWBC::init(int specialFixedWord)
{

  nTranspWords=0;
  int i;
  for(i=0; i<_n1.size(); i++) {
    if( (_n1[i]<mindestAnzahl && _n2[i]<mindestAnzahl && minIndex[i]<=1) ||i==specialFixedWord ) {

      if(!( fixedWord[i]==1 || fixedWord[i]== -1))
        cerr << "mkcls:KategProblemWBC::init::ERROR: " << i << " " << fixedWord[i] << endl;
      fixedWord[i]=1;
    } else if(fixedWord[i]<0)
      nTranspWords++;
  }
  if( absteigend==0 )
    absteigend= &(getSortedList(0));





  if(verboseMode && nTranspWords!=_n1.size()-1 )
    cout << "Es sind: " <<nTranspWords<<" transportierbar.\n";
}

void KategProblemWBC::set_h_of_words(double s)

{
  with_h_of_words=1;
  h_of_words = -s;
}

double KategProblemWBC::get_h_of_words()

{
  if( with_h_of_words )
    return -h_of_words;
  else {
    h_of_words=0;
    for(int i=0; i<nWords; i++)
      h_of_words+=0.5*(kat_h(_n2[i])+kat_h(_n1[i]));
    with_h_of_words=1;
    return -h_of_words;
  }
}


void KategProblemWBC::setAfterWords(int w,int anzahl)

{
  OneFreq o;
  o.w=-1;
  o.n=0;
  afterFilled[w]=0;
  after[w].init(anzahl,o,1);
}
void KategProblemWBC::setBeforeWords(int w,int anzahl)

{
  OneFreq o;
  o.w=-1;
  o.n=0;
  beforeFilled[w]=0;
  before[w].init(anzahl,o,1);
}


void KategProblemWBC::setFreq(int w1,int w2,FreqType anzahl)

{
  OneFreq o;
  o.n=anzahl;

  o.w=w2;
  after[w1][afterFilled[w1]++]=o;
  _n1[w1]+=anzahl;
  o.w=w1;
  before[w2][beforeFilled[w2]++]=o;
  _n2[w2]+=anzahl;
}

void KategProblemWBC::addFreq(int w1,int w2,FreqType anzahl)

{
  OneFreq o;
  o.n=anzahl;
  int pos=-1,i;
  for(i=0; i<afterFilled[w1]; i++)
    if(after[w1][i].w==w2)
      pos=i;

  if(pos==-1) {
    o.w=w2;
    after[w1][afterFilled[w1]++]=o;
  } else
    after[w1][pos].n+=anzahl;
  _n1[w1]+=anzahl;

  pos=-1;
  for(i=0; i<beforeFilled[w2]; i++)
    if(before[w2][i].w==w1)
      pos=i;
  if(pos==-1) {
    o.w=w1;
    before[w2][beforeFilled[w2]++]=o;
  } else
    before[w2][pos].n+=anzahl;
  _n2[w2]+=anzahl;
}


short KategProblemWBC::testFull(int doIt)

{
  int enaNom=0;
  int afterFilledSum=0,beforeFilledSum=0;
  int ret=1,i;
  for(i=0; i<nWords; i++) {
    if( n1(i)==1 && n2(i)==1 )
      enaNom++;
    afterFilledSum+=afterFilled[i];
    beforeFilledSum+=beforeFilled[i];
    if(afterFilled[i]!=after[i].size()) {
      ret=0;
      if( doIt )
        after[i].resize(afterFilled[i]);
    }
    if(beforeFilled[i]!=before[i].size()) {
      ret=0;
      if( doIt )
        before[i].resize(beforeFilled[i]);
    }

  }
  if( ret==0 && !doIt ) {
    cerr << "Error: Unfilled word bigram statistics.\n";
    exit(1);
  } else
    filled=1;
  if( verboseMode>1 ) {
    cout << "MEAN(|L(w)|+|R(w)|)=" << (beforeFilledSum/(float)nWords)
         +(afterFilledSum/(float)nWords) << endl;
    cout << "Hapaslegomena: " << enaNom << endl;
  }
  int symmetrisch=1;
  for(i=0; i<nWords; i++) {
    int j;
    massert(before[i].size()==beforeFilled[i]);
    massert( after[i].size()== afterFilled[i]);
    FreqType sum=0;
    for(j=0; j<after[i].size(); j++)
      sum+=after[i][j].n;
    massert( sum==_n1[i] );
    sum=0;
    for(j=0; j<before[i].size(); j++)
      sum+=before[i][j].n;
    massert(sum==_n2[i]);
    if(_n1[i]!=_n2[i]) {
      symmetrisch=0;
      if( verboseMode>1 )
        cout << "Asymmetrie: " << i << " " << _n1[i] << " " << _n2[i] << endl;
    }

  }
  if(verboseMode && symmetrisch==0)
    cout << "Warning: word bigram statistic is not symmetric "
         "(this is possibly an error)\n";
  return ret;
}

Array<Word> &KategProblemWBC::getSortedList(int steigend)

{
  int siz=_n2.size(),i;
  massert(filled);
  Array<Word> &sortedList =*new Array<Word>(siz);
  Array<OneFreq> list(siz);
  int pos=0;
  for(i=0; i<siz; i++) {
    if( fixedWord[i]<0 ) {
      list[pos].w=i;
      list[pos].n=_n1[i];
      pos++;
    }
  }
  int anzFree=pos;
  for(i=0; i<siz; i++) {
    if( fixedWord[i]>=0 ) {
      list[pos].w=i;
      list[pos].n=_n1[i];
      pos++;
    }
  }
  massert(pos==siz);
  if(steigend )
    qsort(list.getPointerToData(),anzFree,sizeof(OneFreq),oneFreqCompareSteigend);
  else
    qsort(list.getPointerToData(),anzFree,sizeof(OneFreq),oneFreqCompareFallend);
  massert( anzFree<=list.size() );

  for(i=0; i<siz; i++) {
    sortedList[i]=list[i].w;
    massert(steigend || i==0 || i>=anzFree || list[i-1].n>=list[i].n );
    massert((!steigend) || i==0 || i>=anzFree || list[i-1].n<=list[i].n );
  }
  return sortedList;
}

FreqType KategProblemWBC::numberOfWords()

{
  FreqType n1=0,n2=0;
  for(int i=0; i<_n1.size(); i++) {
    n1+=_n1[i];
    n2+=_n2[i];
  }
#ifndef FREQTYPE_DOUBLE
  massert(n1==n2);
#endif
  return n1;
}

void KategProblemWBC::setDollar(int n)

{
  if( fixedWord[n]<0 )
    nTranspWords--;
  fixedWord[n]=0;
}

void KategProblemWBC::initializeIndex(const leda_array<string>&words,char firstChar,int unten,int oben,bool noHapas)
{
  int n=0;
  int i;
  massert(-1<unten);
  massert(unten<oben);
  if( verboseMode )
    cout << "InitializeIndex: " << firstChar << " u:" << unten << " o:" << oben << " " << noHapas << endl;
  over_array(words,i) {
    if( words[i][0]==firstChar && (noHapas || ((short)(n1(i)+0.0001))>=mindestAnzahl || ((short)(n2(i)+0.0001))>=mindestAnzahl) ) {
      minIndex[i]=unten;
      maxIndex[i]=oben;
      n++;
    }
  }
  if( verboseMode )
    cout << "InitializeIndex gefunden fuer " << n << " Woerter.\n";
}

